# R for Data Science Chapter 5
# Exploratory Data Analysis
# Daniel J. Vera, Ph.D.

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(nycflights13)

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))

diamonds %>%
  count(cut)
## # A tibble: 5 × 2
##         cut     n
##       <ord> <int>
## 1      Fair  1610
## 2      Good  4906
## 3 Very Good 12082
## 4   Premium 13791
## 5     Ideal 21551
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = carat), binwidth = 0.5)

diamonds %>%
  count(cut_width(carat, 0.5))
## # A tibble: 11 × 2
##    `cut_width(carat, 0.5)`     n
##                     <fctr> <int>
## 1             [-0.25,0.25]   785
## 2              (0.25,0.75] 29498
## 3              (0.75,1.25] 15977
## 4              (1.25,1.75]  5313
## 5              (1.75,2.25]  2002
## 6              (2.25,2.75]   322
## 7              (2.75,3.25]    32
## 8              (3.25,3.75]     5
## 9              (3.75,4.25]     4
## 10             (4.25,4.75]     1
## 11             (4.75,5.25]     1
smaller_diamonds <- diamonds %>%
  filter(carat < 3)

ggplot(data = smaller_diamonds, mapping = aes(x = carat)) + 
  geom_histogram(binwidth = 0.1)

ggplot(data = smaller_diamonds, mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.01)

ggplot(data = smaller_diamonds, mapping = aes(x = carat, color = cut)) +
  geom_freqpoly(binwidth = 0.1)

ggplot(data = faithful, mapping = aes(x = eruptions)) +
  geom_histogram(binwdith = 0.25)
## Warning: Ignoring unknown parameters: binwdith
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) + 
  geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

unusual <- diamonds %>%
  # trying to pick up on unusual values in previous graph
  filter(y < 3 | y > 20) %>% 
  arrange(y) 

# Exercises 7.3.4 on website
#===============================================================
# Explore the distribution of each of the x, y, and z variables 
# in diamonds. What do you learn? Think about a diamond and how 
# you might decide which dimension is the length, width, and depth.
ggplot(data = diamonds) + 
  geom_histogram(mapping = aes(x = x), binwidth = 0.5)

ggplot(data = diamonds) + 
  geom_histogram(mapping = aes(x = y), binwidth = 0.5)

ggplot(data = diamonds) + 
  geom_histogram(mapping = aes(x = z), binwidth = 0.5)

# Explore the distribution of price. Do you discover anything unusual 
# or surprising? 
# (Hint: Carefully think about the binwidth and make sure you try a 
# wide range of values.)

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 100)

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 100) +
  coord_cartesian(xlim = c(0, 5000))

diamonds %>% 
  filter(price < 1500) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 20010
diamonds %>% 
  filter(price > 1500) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 33930
# How many diamonds are 0.99 carat? How many are 1 carat? 
# What do you think is the cause of the difference?

ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_histogram(binwidth = .01) +
  coord_cartesian(xlim = c(0.90, 1.1))

diamonds %>% 
  filter(carat == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1558
diamonds %>% 
  filter(carat == 0.99) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    23
# most likely the 0.99 are ones made with the intention of being
# 1 carat and there is some error that makes it 0.99 or there is 
# a measurement error another reason from online solutions:
# Around 1500 diamonds are 1.001.00 carat, compared to around 30 
# or so diamonds at .99.99 carat. This could occur because prospective 
# buyers of diamonds, if they are already willing to buy a .99.99 carat 
# diamond will decide it is more aesthetically pleasing to say they 
# bought a 11 carat diamond so there is less demand for .99.99 carat 
# diamonds.

# Compare and contrast coord_cartesian() vs xlim() or ylim() when 
# zooming in on a histogram. What happens if you leave binwidth unset? 
# What happens if you try and zoom so only half a bar shows?
ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_histogram(binwidth = .01) +
  coord_cartesian(xlim = c(0.90, 1.1))

ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_histogram(binwidth = .01) +
  xlim(0.90, 1.1)
## Warning: Removed 43609 rows containing non-finite values (stat_bin).

ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_histogram() +
  coord_cartesian(xlim = c(0.90, 1.1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_histogram() +
  xlim(0.90, 1.1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 43609 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

#===============================================================
diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values (geom_point).

ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

# Exercises 7.4.1 on website
#===============================================================
# What happens to missing values in a histogram? What happens to 
# missing values in a bar chart? Why is there a difference?
ggplot(data = flights, mapping = aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8255 rows containing non-finite values (stat_bin).

flights %>%
  mutate(carrier = ifelse(carrier == "AA", NA, carrier)) %>%
  ggplot(aes(carrier)) +
  geom_bar()

# Histograms omit missing values, bar charts draw them as another
# category

# What does na.rm = TRUE do in mean() and sum()?
# It removes the NA missing values before computing.
#===============================================================

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar()

ggplot(
  data = diamonds,
  mapping = aes(x = price, y = ..density..)
  ) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

ggplot(data = mpg) + 
  geom_boxplot(
    mapping = aes(
      x = reorder(class, hwy, FUN = median), 
      y = hwy
      )
    ) +
  coord_flip()

# Exercises 7.5.1.1 on website
#===============================================================
# Use what you’ve learned to improve the visualisation of the 
# departure times of cancelled vs. non-cancelled flights.

nycflights13::flights %>% 
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>% 
  ggplot(mapping = aes(sched_dep_time, y = ..density..)) + 
  geom_freqpoly(
    mapping = aes(colour = cancelled), 
    size = 1.5, 
    binwidth = 1/4)

# What variable in the diamonds dataset is most important for predicting the   
# price of a diamond? How is that variable correlated with cut? Why does the 
# combination of those two relationships lead to lower quality diamonds 
# being more expensive?

diamonds_numeric <- diamonds[-c(2, 3, 4)]
cor(diamonds_numeric)
##            carat       depth      table      price           x           y
## carat 1.00000000  0.02822431  0.1816175  0.9215913  0.97509423  0.95172220
## depth 0.02822431  1.00000000 -0.2957785 -0.0106474 -0.02528925 -0.02934067
## table 0.18161755 -0.29577852  1.0000000  0.1271339  0.19534428  0.18376015
## price 0.92159130 -0.01064740  0.1271339  1.0000000  0.88443516  0.86542090
## x     0.97509423 -0.02528925  0.1953443  0.8844352  1.00000000  0.97470148
## y     0.95172220 -0.02934067  0.1837601  0.8654209  0.97470148  1.00000000
## z     0.95338738  0.09492388  0.1509287  0.8612494  0.97077180  0.95200572
##                z
## carat 0.95338738
## depth 0.09492388
## table 0.15092869
## price 0.86124944
## x     0.97077180
## y     0.95200572
## z     1.00000000
lm_diamonds = lm(price ~ ., data = diamonds)
summary(lm_diamonds)
## 
## Call:
## lm(formula = price ~ ., data = diamonds)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21376.0   -592.4   -183.5    376.4  10694.2 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  5753.762    396.630   14.507  < 2e-16 ***
## carat       11256.978     48.628  231.494  < 2e-16 ***
## cut.L         584.457     22.478   26.001  < 2e-16 ***
## cut.Q        -301.908     17.994  -16.778  < 2e-16 ***
## cut.C         148.035     15.483    9.561  < 2e-16 ***
## cut^4         -20.794     12.377   -1.680  0.09294 .  
## color.L     -1952.160     17.342 -112.570  < 2e-16 ***
## color.Q      -672.054     15.777  -42.597  < 2e-16 ***
## color.C      -165.283     14.725  -11.225  < 2e-16 ***
## color^4        38.195     13.527    2.824  0.00475 ** 
## color^5       -95.793     12.776   -7.498 6.59e-14 ***
## color^6       -48.466     11.614   -4.173 3.01e-05 ***
## clarity.L    4097.431     30.259  135.414  < 2e-16 ***
## clarity.Q   -1925.004     28.227  -68.197  < 2e-16 ***
## clarity.C     982.205     24.152   40.668  < 2e-16 ***
## clarity^4    -364.918     19.285  -18.922  < 2e-16 ***
## clarity^5     233.563     15.752   14.828  < 2e-16 ***
## clarity^6       6.883     13.715    0.502  0.61575    
## clarity^7      90.640     12.103    7.489 7.06e-14 ***
## depth         -63.806      4.535  -14.071  < 2e-16 ***
## table         -26.474      2.912   -9.092  < 2e-16 ***
## x           -1008.261     32.898  -30.648  < 2e-16 ***
## y               9.609     19.333    0.497  0.61918    
## z             -50.119     33.486   -1.497  0.13448    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1130 on 53916 degrees of freedom
## Multiple R-squared:  0.9198, Adjusted R-squared:  0.9198 
## F-statistic: 2.688e+04 on 23 and 53916 DF,  p-value: < 2.2e-16
lm_diamonds_carat = lm(price ~ carat, data = diamonds)
summary(lm_diamonds_carat)
## 
## Call:
## lm(formula = price ~ carat, data = diamonds)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18585.3   -804.8    -18.9    537.4  12731.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2256.36      13.06  -172.8   <2e-16 ***
## carat        7756.43      14.07   551.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1549 on 53938 degrees of freedom
## Multiple R-squared:  0.8493, Adjusted R-squared:  0.8493 
## F-statistic: 3.041e+05 on 1 and 53938 DF,  p-value: < 2.2e-16
# carat looks like the most highly correlated.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) + 
  geom_boxplot()

# it looks like on average, ideal cuts have lower carat and since
# carat is variable with highest correlation to price, ideal cuts
# will be cheaper, on average.

# Install the ggstance package, and create a horizontal boxplot. 
# How does this compare to using coord_flip()?
ggplot(data = diamonds, mapping = aes(x = cut, y = carat)) + 
  geom_boxplot() +
  coord_flip()

# install.packages("ggstance") 
library(ggstance) # should go in beginning but see next comment
## 
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh
#bad form generally in scripts to install packages but for HW
ggplot(data = diamonds, mapping = aes(x = carat, y = cut)) + 
  geom_boxploth()

# To create a horizontal layer in ggplot2 with coord_flip(), you 
# have to supply aesthetics as if they were to be drawn vertically
# then flip with coord_flip()
# In ggstance you do it as if drawn horizontally, see above.

# One problem with boxplots is that they were developed in an era of 
# much smaller datasets and tend to display a prohibitively large number 
# of “outlying values”. One approach to remedy this problem is the 
# letter value plot. 
# Install the lvplot package, and try using geom_lv() to display the 
# distribution of price vs cut. What do you learn? How do you interpret 
# the plots?

#install.packages("lvplot") # see above comment on installing packages
library(lvplot) # should at begining but see above

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + 
  geom_boxplot()

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + 
  geom_lv()

# Compare and contrast geom_violin() with a facetted geom_histogram(), 
# or a coloured geom_freqpoly().
# What are the pros and cons of each method?

ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = price)) + 
  facet_grid(cut ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds, 
       mapping = aes(x = price, y = ..density.., color = cut)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
  geom_violin(mapping = aes(x = cut, y = price)) + 
  coord_flip()

# If you have a small dataset, it’s sometimes useful to use geom_jitter() 
# to see the relationship between a continuous and categorical variable. 
# The ggbeeswarm package provides a number of methods similar to 
# geom_jitter(). List them and briefly describe what each one does.
#install.packages("ggbeeswarm")
library(ggbeeswarm)
?ggbeeswarm

ggplot(data = mpg) +
  geom_quasirandom(mapping = aes(x = reorder(class, hwy, FUN = median),
                                 y = hwy))

ggplot(data = mpg) +
  geom_beeswarm(mapping = aes(x = reorder(class, hwy, FUN = median),
                              y = hwy))

# The beeswarm geom is a convenient means to offset points within categories 
# to reduce overplotting. Uses the beeswarm package

# The quasirandom geom is a convenient means to offset points within 
# categories to reduce overplotting. Uses the vipor package.

# Both similar to geom_jitter()
#===============================================================
ggplot(data = diamonds) +
  geom_count(mapping = aes(x = cut, y = color))

diamonds %>%
  count(color, cut) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
  geom_tile(mapping = aes(fill = n))

# Exercises 7.5.2.1 on website
#===============================================================
# How could you rescale the count dataset above to more clearly show 
# the distribution of cut within color, or color within cut?

# calculate proportion
diamonds %>%
  count(cut, color) %>%
  group_by(color) %>%
  mutate(prop = n/sum(n)) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
  geom_tile(mapping = aes(fill = prop))

diamonds %>%
  count(cut, color) %>%
  group_by(cut) %>%
  mutate(prop = n/sum(n)) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
  geom_tile(mapping = aes(fill = prop))

# Use geom_tile() together with dplyr to explore how average flight 
# delays vary by destination and month of year. 
# What makes the plot difficult to read? How could you improve it?

flights %>%
  group_by(month, dest) %>%
  mutate(avg_delay = mean(arr_delay, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = factor(month), y = dest)) +
  geom_tile(mapping = aes(fill = avg_delay)) + 
  labs(x = "Month", y = "Destination", fill = "Average Delay")

# Why is it slightly better to use aes(x = color, y = cut) rather than 
# aes(x = cut, y = color) in the example above?

diamonds %>% 
  count(color, cut) %>% 
  ggplot(mapping = aes(x = color, y = cut)) + 
  geom_tile(mapping = aes( fill = n))

diamonds %>% 
  count(color, cut) %>% 
  ggplot(mapping = aes(x = cut, y = color)) + 
  geom_tile(mapping = aes( fill = n))

# easier to read longer labels in y axis
# from solutions on web:
# Another justification, for switching the order is that the larger 
# numbers are at the top when x = color and y = cut, and that lowers 
# the cognitive burden of interpreting the plot.
#===============================================================
ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))

ggplot(data = diamonds) +
  geom_point(
    mapping = aes(x = carat, y = price),
    alpha = 1/100
    )

ggplot(data = smaller_diamonds) + 
  geom_bin2d(mapping = aes(x = carat, y = price))

#install.packages("hexbin")
library(hexbin)
ggplot(data = smaller_diamonds) +
  geom_hex(mapping = aes(x = carat, y = price))

ggplot(data = smaller_diamonds, mapping = aes(x = carat, y = price)) +
  geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))

ggplot(data = smaller_diamonds, mapping = aes(x = carat, y = price)) +
  geom_boxplot(mapping = aes(group = cut_number(carat, 20))) 

# Exercises 7.5.3.1 on website
#===============================================================
# Instead of summarising the conditional distribution with a boxplot, 
# you could use a frequency polygon. What do you need to consider 
# when using cut_width() vs cut_number()? How does that impact a 
# visualisation of the 2d distribution of carat and price?
ggplot(data = diamonds) +
  geom_freqpoly(
    mapping = aes(x = price, color = cut_width(carat, 0.3))
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
  geom_freqpoly(
    mapping = aes(x = price, 
                  y = ..density..,
                  color = cut_width(carat, 0.3))
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
  geom_freqpoly(
    mapping = aes(x = price, color = cut_number(carat, 10))
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
  geom_freqpoly(
    mapping = aes(x = price, 
                  y = ..density..,
                  color = cut_number(carat, 10))
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Visualise the distribution of carat, partitioned by price.
ggplot(data = diamonds, mapping = aes(x = cut_number(price, 10), y = carat)) +
  geom_boxplot() +
  coord_flip() +
  xlab("Price")

ggplot(diamonds, aes(x = cut_width(price, 2000, boundary = 0), y = carat)) +
  geom_boxplot(varwidth = TRUE) +
  coord_flip() +
  xlab("Price")

# How does the price distribution of very large diamonds compare to 
# small diamonds. Is it as you expect, or does it surprise you?

ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_boxplot(mapping = aes(group = cut_width(carat, .3))) 

# More variance in large diamonds. 
 
# Combine two of the techniques you’ve learned to visualise the combined 
# distribution of cut, carat, and price.

# Instead of summarizing the conditional distribution with a boxplot, 
# you could use a frequency polygon. What do you need to consider when 
# using cut_width() versus cut_number()? How does that impact a 
# visualization of the 2D distribution of carat and price? 


ggplot(data = diamonds, 
       mapping = aes(x = carat, y = price)) + 
  geom_boxplot(mapping = aes(group = cut_width(carat, 0.3)))

ggplot(data = diamonds, 
       mapping = aes(x = carat, y = price)) + 
  geom_boxplot(mapping = aes(group = cut_number(carat, 10)))

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut_width(carat, 0.3)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut_number(carat, 10)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# You need to pay attention to the width of carat you are looking at
# or the number of bins. 

# Visualize the distribution of carat, partitioned by price. 
ggplot(data = diamonds) +
  geom_boxplot(mapping = aes(x = cut_number(price, 10), y = carat)) +
  coord_flip()

ggplot(data = diamonds) +
  geom_lv(mapping = aes(x = cut_number(price, 10), y = carat)) +
  coord_flip()

# How does the price distribution of very large diamonds compare to 
# small diamonds. Is it as you expect, or does it surprise you? 

# Combine two of the techniques you’ve learned to visualize the combined 
# distribution of cut, carat, and price. 


# Two-dimensional plots reveal outliers that are not visible in 
# one-dimensional plots. For example, some points in the following plot 
# have an unusual combination of x and y values, which makes the points
# outliers even though their x and y values appear normal when examined 
# separately: 

ggplot(data = diamonds) + 
  geom_point( mapping = aes(x = x, y = y)) + 
  coord_cartesian( xlim = c(4, 11), ylim = c(4, 11)) 

# Why is a scatterplot a better display than a binned plot for this case?